This dataset contains the following books:
* Upanishads - are ancient Sanskrit texts of spiritual teaching and ideas of Hinduism. They are the part of the oldest scriptures of Hinduism, the Vedas, that deal with meditation, philosophy, and spiritual knowledge; other parts of the Vedas deal with mantras, benedictions, rituals, ceremonies, and sacrifices.
Yoga Sutras - collection of 196 Sanskrit sutras* (aphorisms) on the theory and practice of yoga.
Buddha Sutras - were initially passed on orally by monks, but were later written down and composed as manuscripts in various Indo-Aryan languages which were then translated into other local languages as Buddhism spread
Tao Te Ching - fundamental text for both philosophical and religious Taoism. It also strongly influenced other schools of Chinese philosophy and religion, including Legalism, Confucianism, and Buddhism, which was largely interpreted through the use of Taoist words and concepts when it was originally introduced to China.
Old Testament :
Book of Wisdom - Solomon’s speech concerning wisdom, wealth, power and prayer
Book of Proverbs - Proverbs is not merely an anthology but a “collection of collections” relating to a pattern of life which lasted for more than a millennium. It is an example of the Biblical wisdom tradition, and raises questions of values, moral behaviour, the meaning of human life, and right conduct.
Book of Ecclesiastes - is one of 24 books of the Tanakh (Hebrew Bible), where it is classified as one of the Ketuvim (Writings).
Book of Ecclesiasticus - commonly called the Wisdom of Sirach or simply Sirach.
More on wikipedia
**in Indian literary traditions refers to an aphorism or a collection of aphorisms in the form of a manual or, more broadly, a condensed manual or text. Sutras are a genre of ancient and medieval Indian texts found in Hinduism, Buddhism and Jainism.*
Quick look at the data size:
# Quick dimensions of the document-term matrix: one row per chapter, one
# column per distinct word (plus the chapter-label column X).
cat('Number of features:', ncol(data))
## Number of features: 8267
cat('Number of records:', nrow(data))
## Number of records: 590
# data[-1] drops the first (label) column, so only word counts are summed
cat('Number of words in total:', sum(data[-1]))
## Number of words in total: 60609
To give some perspective, the Polish classic ‘Pan Tadeusz’ has 68 682 words in total.
# Preview the first 10 chapters x 10 words of the document-term matrix
kable_styling(
  knitr::kable(data[1:10, 1:10], caption = 'Dataset', booktabs = TRUE)
)
| X | foolishness | hath | wholesome | takest | feelings | anger | vaivaswata | matrix | kindled |
|---|---|---|---|---|---|---|---|---|---|
| Buddhism_Ch1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Buddhism_Ch10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Looking at the data we are certain that there is no sense in keeping the chapters separated. We then used the stringi package to extract the names of the books. We figured that we would combine the biblical texts into one, as they have significantly fewer chapters than the rest. Then we truncated the data to have only one book per row (word occurrences were summed). We ended up with this dataframe:
# Reduce chapter labels ("Buddhism_Ch1") to book names, then collapse the
# document-term matrix from one row per chapter to one row per book.
book_name <- stri_extract(data$X, regex = "^[a-zA-Z]+")
# The four biblical books are the only names starting with "Bo" — merge them
book_name <- ifelse(startsWith(book_name, "Bo"), "Bible", book_name)
data$book_name <- book_name
data <- data[, -1]                    # drop the chapter-id column X
book_names <- unique(data$book_name)
word_cols <- seq_len(ncol(data) - 1)  # book_name is the last column
# rowsum() sums the word counts per book in one vectorized pass,
# replacing an explicit per-book colSums() loop
agg <- rowsum(as.matrix(data[, word_cols]), group = data$book_name)
df <- as.data.frame(agg[book_names, , drop = FALSE])  # keep first-seen book order
rownames(df) <- NULL
df <- cbind(book_names, df)
colnames(df) <- c("book_name", colnames(data)[word_cols])
m <- ncol(df)
# Preview the collapsed table: one row per book now
kable_styling(
  knitr::kable(df[1:5, 1:10], caption = 'Dataset', booktabs = TRUE)
)
| book_name | foolishness | hath | wholesome | takest | feelings | anger | vaivaswata | matrix | kindled |
|---|---|---|---|---|---|---|---|---|---|
| Buddhism | 0 | 0 | 0 | 0 | 19 | 0 | 0 | 0 | 0 |
| TaoTeChing | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| Upanishad | 0 | 0 | 0 | 0 | 0 | 3 | 1 | 0 | 1 |
| YogaSutra | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| Bible | 2 | 332 | 3 | 1 | 0 | 31 | 0 | 0 | 3 |
It is already better for visualization.
Most common words per book
# Horizontal bar chart of the ten most frequent words in each book.
# (fixed: `decreasing = T` -> TRUE; T is a reassignable binding, not a keyword)
for (bn in book_names) {
  tmp <- sort(df[df$book_name == bn, 2:m], decreasing = TRUE)
  # take the top 10, reversed so the most frequent word is drawn at the top
  barplot(height = unlist(tmp[10:1]),
          las = 2,
          horiz = TRUE,
          main = paste("Most frequent words in", bn),
          cex.names = 0.7,
          col = "lightblue")
}
More interesting way to visualize words is word clouds
TaoTeChing
bn <- "TaoTeChing"
# Frequencies for this book only; drop names so data.frame() doesn't inherit them
tmp <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)
set.seed(1234)  # fixed seed -> reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"), main = bn)
Bible
bn <- "Bible"
# Frequencies for this book only, as an unnamed numeric vector
tmp <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)
set.seed(1234)  # fixed seed -> reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"), main = bn)
Buddhism
bn <- "Buddhism"
# Frequencies for this book only, as an unnamed numeric vector
tmp <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)
set.seed(1234)  # fixed seed -> reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words = 50, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"), main = bn)
Upanishad
bn <- "Upanishad"
# Frequencies for this book only, as an unnamed numeric vector
tmp <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)
set.seed(1234)  # fixed seed -> reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words = 80, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"), main = bn)
YogaSutra
bn <- "YogaSutra"
# Frequencies for this book only, as an unnamed numeric vector
tmp <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
df2 <- data.frame(word = colnames(df)[-1], freq = tmp)
set.seed(1234)  # fixed seed -> reproducible cloud layout
wordcloud(words = df2$word, freq = df2$freq, min.freq = 1,
          max.words = 100, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"), main = bn)
How our chapters, categorized into books, look when treated with t-SNE:
library(Rtsne)
library(ggplot2)
library(plotly)
# t-SNE embedding of every chapter (word-count columns only) into 2-D.
# FIX: the argument name is `perplexity` — the original `preplexity` was
# silently swallowed by `...`, so Rtsne fell back to its default (30; the
# same value by coincidence, which is why the output still looked right).
tsne <- Rtsne(data[,1:8266], dims = 2, perplexity = 30, verbose=TRUE, max_iter = 500)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.09 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 60.478213 (50 iterations in 0.23 seconds)
## Iteration 100: error is 60.305248 (50 iterations in 0.27 seconds)
## Iteration 150: error is 59.533257 (50 iterations in 0.21 seconds)
## Iteration 200: error is 59.502624 (50 iterations in 0.19 seconds)
## Iteration 250: error is 59.494416 (50 iterations in 0.08 seconds)
## Iteration 300: error is 1.123140 (50 iterations in 0.07 seconds)
## Iteration 350: error is 1.055831 (50 iterations in 0.07 seconds)
## Iteration 400: error is 1.038577 (50 iterations in 0.07 seconds)
## Iteration 450: error is 1.030946 (50 iterations in 0.07 seconds)
## Iteration 500: error is 1.024539 (50 iterations in 0.07 seconds)
## Fitting performed in 1.33 seconds.
# 2-D embedding: one point per chapter, coloured by its (pre-collapse) book label
data_to_plot <- as.data.frame(tsne$Y)
data_to_plot$label <- book_name
ggplot(data_to_plot, aes(x = V1, y = V2, colour = label)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = brewer.pal(8, "Set1"))
And how they look in 3d
# Same embedding in 3-D (FIX: `preplexity` typo corrected to `perplexity`;
# the misspelled argument was ignored via `...` and the default used instead)
tsne <- Rtsne(data[,1:8266], dims = 3, perplexity = 30, verbose=TRUE, max_iter = 500)
## Performing PCA
## Read the 590 x 50 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.10 seconds (sparsity = 0.264752)!
## Learning embedding...
## Iteration 50: error is 59.826617 (50 iterations in 0.44 seconds)
## Iteration 100: error is 59.752607 (50 iterations in 0.45 seconds)
## Iteration 150: error is 59.515524 (50 iterations in 0.26 seconds)
## Iteration 200: error is 59.504687 (50 iterations in 0.23 seconds)
## Iteration 250: error is 59.515569 (50 iterations in 0.20 seconds)
## Iteration 300: error is 0.977989 (50 iterations in 0.16 seconds)
## Iteration 350: error is 0.898545 (50 iterations in 0.15 seconds)
## Iteration 400: error is 0.872014 (50 iterations in 0.15 seconds)
## Iteration 450: error is 0.861557 (50 iterations in 0.15 seconds)
## Iteration 500: error is 0.854472 (50 iterations in 0.17 seconds)
## Fitting performed in 2.36 seconds.
# 3-D embedding: one interactive point per chapter, coloured by book label
data_to_plot <- as.data.frame(tsne$Y)
data_to_plot$label <- book_name
plot_ly(data_to_plot, x = ~V1, y = ~V2, z = ~V3, color = ~label, size = 0.1)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
Word lengths
# Build a long table with one row per word occurrence: its length and book.
# FIXES: the original grew `d` with rbind() inside the loop (quadratic) and
# used sapply() with a 1x1 data frame as the `times` argument of rep();
# here a single vectorized rep(lengths, counts) per book is bound once.
word_lens <- nchar(colnames(df)[-1])          # length of each vocabulary word
d <- do.call(rbind, lapply(book_names, function(bn) {
  counts <- unlist(df[df$book_name == bn, -1], use.names = FALSE)
  data.frame(word_len = rep(word_lens, counts), book = bn)
}))
# Smoothed distribution of word lengths per book…
ggplot(d, aes(x = word_len, fill = book)) +
  geom_density(adjust = 2, alpha = 0.5) +
  theme_minimal()
# …and the same distributions as boxplots
ggplot(d, aes(x = book, y = word_len, fill = book)) +
  geom_boxplot() +
  theme_minimal()
The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). The annotations were manually done by crowdsourcing.
get_words_out_of_bag <- function(df,n){
  #' Expand one row of a labelled bag-of-words table into a word vector.
  #'
  #' Column 1 is the (character) book label; it always survives the
  #' non-zero filter, so real word columns start at position 2 of the
  #' kept set.
  #'
  #' @param df bag-of-words data frame (col 1 = label, rest = counts)
  #' @param n row number to expand
  #' @return character vector with each word repeated by its count,
  #'   or list(NULL) when the row contains no words at all
  keep <- which(df[n,] != 0)              # base subsetting replaces dplyr select-by-position
  row <- df[n, keep, drop = FALSE]
  cols <- colnames(row)
  if(length(cols) == 1){return(list(NULL))}
  # lapply instead of sapply: the return type no longer depends on the input
  words <- lapply(2:length(cols), function(i) rep(cols[i], row[[i]]))
  unlist(words)
}
get_words_in_books <- function(df, bookname){
  #' Collect every word (with multiplicity) from all rows labelled `bookname`.
  #'
  #' @param df labelled bag-of-words data frame; column 1 holds book labels
  #' @param bookname the label whose rows should be expanded
  #' @return character vector of all words for that book
  slice <- df[df[,1] == bookname,]
  # BUG FIX: index into `slice`, not the full `df`. The original passed
  # df = df, which expanded the FIRST nrow(slice) rows of the whole table —
  # correct only for whichever book happens to be stored first.
  words <- lapply(seq_len(nrow(slice)), get_words_out_of_bag, df = slice)
  unlist(words)
}
# Start one worker per CPU core (parallelMap backend for parallelLapply below)
parallelStartMulticore(detectCores())
## Starting parallelization in mode=multicore with cpus=8.
# Reload the raw labelled DTM and reduce chapter ids ("Buddhism_Ch1") to book names
df <- read.csv('./AllBooks_baseline_DTM_Labelled.csv')
books <- stri_extract(df$X, regex = "^[a-zA-Z]+")
df$X <- books
# Time the parallel expansion of each book's bag of words into word vectors
starttime <- Sys.time()
inside <- parallelLapply(unique(books), get_words_in_books, df=df)
## Mapping in parallel: mode = multicore; level = NA; cpus = 8; elements = 8.
endtime <- Sys.time()
endtime - starttime
## Time difference of 57.16069 secs
select_sentiment <- function(word, sent_name){
  #' Return the words carrying a given sentiment, sorted by frequency.
  #' @param word character vector of words to analyse
  #' @param sent_name name of the NRC sentiment to extract
  #' @return data frame with columns word, n (count) and sent
  # library() instead of require(): fail loudly if a package is missing
  # (require() returns FALSE and the error would surface later, obscurely)
  library('tidytext')
  library('textdata')
  # pull the dictionary entries for the requested sentiment
  nrc_sent <- get_sentiments("nrc") %>%
    filter(sentiment == sent_name)
  # count occurrences of those dictionary words in the input;
  # explicit join key silences dplyr's "joining by" message
  as.data.frame(word) %>%
    inner_join(nrc_sent, by = "word") %>%
    count(word, sort = TRUE) %>%
    mutate(sent = sent_name)
}
all_sentiment <- function(word, k = 100, all = FALSE, sent = c("trust",
                                                               "fear",
                                                               "negative",
                                                               "sadness",
                                                               "anger",
                                                               "surprise",
                                                               "positive",
                                                               "disgust",
                                                               "anticipation",
                                                               "joy")){
  #' Return the k most emotional words in the set.
  #' @param word character vector of words
  #' @param k cutoff for head(k)
  #' @param all if TRUE return all words regardless of k
  #' @param sent return only these sentiments — all of them by default
  library('dplyr')
  # one counted frame per sentiment; a plain lapply() replaces the
  # original Vectorize() wrapper (same result, simpler to read)
  per_sent <- lapply(sent, function(s) select_sentiment(word, s))
  # combine into a single frame and sort by frequency
  combined <- do.call(rbind, per_sent) %>% arrange(desc(n))
  # trim to the k most frequent unless everything was requested
  if(!all) {
    combined <- head(combined, k)
  }
  return(combined)
}
sentiment_table_mixed <- function(word, name, sent = c("trust",
"fear",
"negative",
"sadness",
"anger",
"surprise",
"positive",
"disgust",
"anticipation",
"joy")) {
#' Returns a one-column table with the percentage share of each sentiment.
#' Note: group_by()/summarise() orders the output rows alphabetically by
#' sentiment name, and the sentiment labels themselves are dropped by the
#' final select() — callers must account for that ordering.
#' @param word character vector of words
#' @param name name of the word set (assigned to the output column)
require("dplyr")
# count every sentiment hit in the word set (all = TRUE: no k-cutoff)
zliczone <- all_sentiment(word, all = TRUE, sent = sent)
tab <- zliczone %>%
group_by(sent) %>%
summarise(n = sum(n))%>%
mutate(procent = 100*n / sum(n))%>%
select(procent)
colnames(tab) <- c(name)
return(tab)
}
booknames <- unique(books)
names(inside) <- booknames
# Per-book sentiment percentage tables computed in parallel, then cbind'd
# into one frame (one column per book, one row per sentiment).
df_sented_list <- parallelLapply(booknames, function(x, inside){sentiment_table_mixed(unlist(inside[x]), x)}, inside = inside)
df_sented <- do.call("cbind", df_sented_list)
# BUG FIX: sentiment_table_mixed() builds its rows with group_by(sent) %>%
# summarise(), which returns the groups in ALPHABETICAL order (and drops the
# sent column). The labels attached here must therefore be alphabetical too;
# the original attached them in declaration order ("trust", "fear", ...),
# mislabelling every row of the heatmap.
df_sented <- cbind(df_sented, sent = c("anger",
                                       "anticipation",
                                       "disgust",
                                       "fear",
                                       "joy",
                                       "negative",
                                       "positive",
                                       "sadness",
                                       "surprise",
                                       "trust"))
# long format for the interactive pivot heatmap
melted <- reshape::melt(df_sented)
rpivotTable(data = melted, cols = "sent", rows = "variable", rendererName = "Heatmap", aggregatorName = "Sum", vals = "value")
Let’s see with books of Bible treated as one:
# Collapse the four biblical books into a single "Bible" column by averaging
# their percentage shares. NOTE(review): this is an unweighted mean — each
# book counts equally regardless of its length; pooling raw counts first
# would weight by words instead. Confirm which is intended.
df_sented_simp <- df_sented
df_sented_simp$Bible = (df_sented$BookOfEccleasiasticus +
df_sented$BookOfProverb +
df_sented$BookOfEcclesiastes +
df_sented$BookOfWisdom) / 4
# drop the individual biblical columns now that they are merged
df_sented_simp$BookOfProverb <- NULL
df_sented_simp$BookOfEcclesiastes <- NULL
df_sented_simp$BookOfEccleasiasticus <- NULL
df_sented_simp$BookOfWisdom <- NULL
melted <- reshape::melt(df_sented_simp)
## Using sent as id variables
rpivotTable(data = melted, cols = "sent", rows = "variable", rendererName = "Heatmap", aggregatorName = "Sum", vals = "value")